In [21]:
import numpy as np
import pandas as pd

# Read the Titanic training and test CSV files into two separate DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./titanic/train.csv', './titanic/test.csv')
)

Clean Null Data


In [30]:
# Make the float cast effective: the original expression discarded its result.
train_data['Age'] = train_data['Age'].astype(np.float64)

# Encode sex numerically (female -> 0, male -> 1) so it can serve both as a
# feature and as a grouping key below.
train_data['Gender'] = train_data['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Median age for every (Gender, Pclass) combination; missing ages are skipped.
# Kept as a 2x3 array (rows: Gender 0/1, columns: Pclass 1/2/3) so that
# median_ages[i, j] still means "Gender == i and Pclass == j + 1".
# A combination with no known ages yields NaN.
median_ages = (
    train_data.groupby(['Gender', 'Pclass'])['Age']
    .median()
    .unstack('Pclass')
    .reindex(index=[0, 1], columns=[1, 2, 3])
    .values
)

# AgeFill is Age with each missing value replaced by the median age of the
# passenger's own (Gender, Pclass) group; known ages are left untouched.
train_data['AgeFill'] = train_data['Age'].fillna(
    train_data.groupby(['Gender', 'Pclass'])['Age'].transform('median')
)

# Indicator column: 1 where Age was missing, 0 otherwise.
train_data['AgeIsNull'] = train_data['Age'].isnull().astype(int)

Feature Engineering


In [31]:
# FamilySize is the sum of the SibSp and Parch columns.
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch']
# Interaction of the imputed age with the passenger class. Named 'Age*Class'
# to match the column created for test_data (this cell used 'Age*class').
train_data['Age*Class'] = train_data['AgeFill'] * train_data['Pclass']

Drop redundant features


In [37]:
# Remove the columns that are not fed to the model, including the raw 'Sex'
# and 'Age' columns that 'Gender' and 'AgeFill' were derived from.
train_data = train_data.drop(
    ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'],
    axis=1,
)

Test Data Preparation


In [33]:
# NOTE(review): every imputation statistic in this cell is computed from
# test_data itself rather than from the training set -- confirm this is intended.

# Encode sex numerically (female -> 0, male -> 1), as done for train_data.
test_data['Gender'] = test_data['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Median age for every (Gender, Pclass) combination; missing ages are skipped.
# Kept as a 2x3 array (rows: Gender 0/1, columns: Pclass 1/2/3) so that
# median_ages[i, j] still means "Gender == i and Pclass == j + 1".
median_ages = (
    test_data.groupby(['Gender', 'Pclass'])['Age']
    .median()
    .unstack('Pclass')
    .reindex(index=[0, 1], columns=[1, 2, 3])
    .values
)

# AgeFill is Age with each missing value replaced by the median age of the
# passenger's own (Gender, Pclass) group; known ages are left untouched.
test_data['AgeFill'] = test_data['Age'].fillna(
    test_data.groupby(['Gender', 'Pclass'])['Age'].transform('median')
)

# Indicator column: 1 where Age was missing, 0 otherwise.
test_data['AgeIsNull'] = test_data['Age'].isnull().astype(int)

# Same derived features, created in the same order, as for train_data, so
# that the positional feature matrices line up column by column.
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch']
test_data['Age*Class'] = test_data['AgeFill'] * test_data['Pclass']

# Remove the columns that are not fed to the model, including the raw 'Sex'
# and 'Age' columns that 'Gender' and 'AgeFill' were derived from.
test_data = test_data.drop(
    ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'],
    axis=1,
)

# Replace missing fares with the median fare (median() already skips NaN).
faremedian = test_data['Fare'].median()
test_data['Fare'] = test_data['Fare'].fillna(faremedian)

Random Forest


In [44]:
# Select the label by name rather than by position, so the split does not
# depend on 'Survived' being the first column, and so the labels keep their
# own dtype instead of the common dtype `.values` picks for the whole frame.
y_train = train_data['Survived'].values
X_train = train_data.drop(['Survived'], axis=1).values
X_test = test_data.values

# The matrices are matched column by column, so the feature counts must agree.
assert X_train.shape[1] == X_test.shape[1], 'train/test feature count mismatch'

In [47]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so that Restart & Run All rebuilds the same forest and hence
# the same predictions; with no random_state the result can change per run.
forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = forest.predict(X_test)

Save as CSV


In [80]:
# Use gender_submission.csv as the template: replace its 'Survived' column
# with the predictions cast to int, then write the frame without the index.
result = pd.read_csv('./titanic/gender_submission.csv')
result = result.assign(Survived=y_pred.astype(int))
result.to_csv('./titanic/submissions.csv', index=False)

In [ ]: